library(caret)

# Load the merged data set and keep only the rows whose FAMILY_INCOME is
# present and greater than 10000. The NA check is part of the mask so that
# the comparison never yields NA row indices.
merged_df <- read.csv("merged_final.csv")
merged_df <- merged_df[
  !is.na(merged_df$FAMILY_INCOME) & merged_df$FAMILY_INCOME > 10000,
]

# Build the two predictor tables from merged_df by column position:
#   essay_content - columns 6 to 75, natural-log transformed
#   essay_style   - columns 76 to 167, used as-is
# NOTE(review): log() returns -Inf/NaN for zero or negative inputs;
# presumably columns 6:75 are strictly positive - confirm against the data.
essay_content <- log(merged_df[, 6:75])
essay_style <- merged_df[, 76:167]

# Attach both outcomes (family income as `inc`, SAT total score as `SAT`)
# to each table so either one can serve as the response in a model formula.
essay_content[["inc"]] <- merged_df$FAMILY_INCOME
essay_style[["inc"]] <- merged_df$FAMILY_INCOME

essay_content[["SAT"]] <- merged_df$RSAT_TOTAL_SCORE
essay_style[["SAT"]] <- merged_df$RSAT_TOTAL_SCORE

# Claim examined below: essay content and style relate more strongly to
# family income than SAT score does.
# The final models used in the paper drop the least correlated topic; the
# full-topic versions are kept in this script.

# Baseline: adjusted R-squared of a simple linear regression of family
# income on SAT total score.
print(paste(
  "Adj. R-squared for SAT score predicting income is:",
  summary(
    lm(FAMILY_INCOME ~ RSAT_TOTAL_SCORE, data = merged_df)
  )[["adj.r.squared"]]
))

# 10-fold cross-validated linear models predicting family income (`inc`)
# from essay content and from essay style.
#
# Both feature tables also carry the `SAT` column. With `inc ~ .` that
# column would be used as a predictor, so the resulting R-squared could not
# be compared with the SAT-only baseline. It is therefore dropped from the
# predictors here.

# Fix the RNG state so the cross-validation fold assignment is reproducible.
set.seed(2021)

topic_folds <- trainControl(method = "cv", number = 10)

topic_income_data <- essay_content[
  , setdiff(names(essay_content), "SAT"), drop = FALSE
]

topic_mod <- train(inc ~ ., method = "lm",
                   data = topic_income_data, trControl = topic_folds)

dict_folds <- trainControl(method = "cv", number = 10)

dict_income_data <- essay_style[
  , setdiff(names(essay_style), "SAT"), drop = FALSE
]

dict_mod <- train(inc ~ ., method = "lm",
                  data = dict_income_data, trControl = dict_folds)

print(topic_mod)
print(dict_mod)

# Essay content and style as predictors of SAT score.
#
# 10-fold cross-validated linear models with `SAT` as the response. Both
# feature tables also carry the family-income column `inc`. With `SAT ~ .`
# it would be used as a predictor, so it is dropped here and only essay
# features remain on the right-hand side.

# Fix the RNG state so the cross-validation fold assignment is reproducible.
set.seed(2021)

topic_folds_sat <- trainControl(method = "cv", number = 10)

topic_sat_data <- essay_content[
  , setdiff(names(essay_content), "inc"), drop = FALSE
]

# Use the control object defined for this model (it was previously unused).
topic_mod_sat <- train(SAT ~ ., method = "lm",
                       data = topic_sat_data, trControl = topic_folds_sat)

dict_folds_sat <- trainControl(method = "cv", number = 10)

dict_sat_data <- essay_style[
  , setdiff(names(essay_style), "inc"), drop = FALSE
]

dict_mod_sat <- train(SAT ~ ., method = "lm",
                      data = dict_sat_data, trControl = dict_folds_sat)

print(topic_mod_sat)
print(dict_mod_sat)

